Handmade Markov Chain

We are given the following un-normalized posterior distribution

\[ g(\theta \: \vert \: x) \sim \frac{1}{2} \exp\biggl[-\frac{(\theta + 3)^2}{2}\biggr] + \frac{1}{2} \exp\biggl[-\frac{(\theta - 3)^2}{2}\biggr], \]

and we want to draw a Markov Chain from it, using a Metropolis-Hastings algorithm.

Let’s define the functions in R. We’ll choose a gaussian proposal distribution, with the standard deviation given as input from the user.

posterior <- function(t) {
    0.5 * exp(-0.5 * (t + 3)^2) + 0.5 * exp(-0.5 * (t - 3)^2)
}

metropolis <- function(target, proposal_std, niter, burn_in, thinning) {
    chain <- numeric(niter)
    x_old <- 0 # trial move
    accepted <- 0

    for (i in seq_along(chain)) {
        # generate trial move
        x_new <- rnorm(1, x_old, proposal_std)

        # accept with metropolis test
        acc_prob <- min(1, target(x_new) / target(x_old))
        if (runif(1) < acc_prob) {
            x_old <- x_new
            accepted <- accepted + 1
        }

        # store result
        chain[i] <- x_old
    }

    # discard burn-in and thin
    chain <- chain[seq(burn_in, length(chain), by = thinning)]

    list(chain = chain, acc_rate = accepted / niter)
}

Let’s test the algorithm and display the results in a plot.

std <- 1
niter <- 1e5
burn_in <- 1000
thinning <- 10

mc <- metropolis(posterior, std, niter, burn_in, thinning)

cat("Acceptance rate =", mc$acc_rate)
## Acceptance rate = 0.71051
true_func <- tibble(
    theta = seq(-8, 8, by = 0.01),
    # we've to normalize the posterior here
    post = posterior(theta) / integrate(posterior, -8, 8)$value
)

tibble(chain = mc$chain) |>
    ggplot(aes(x = chain)) +
        geom_histogram(
            aes(y = after_stat(density)),
            binwidth = 0.2, fill = "#e6a452"
        ) +
        geom_line(
            aes(x = theta, y = post),
            data = true_func,
            colour = "#945a10", linewidth = 0.8
        ) +
        labs(x = "θ", y = "Posterior g(θ | x)")

The agreement is good. Now, let’s analyse the chain with the CODA package and plot the chain autocorrelation.

library(coda)

chain <- as.mcmc(mc$chain)

tibble(lag = seq(0, 500, by = 10), acor = autocorr(chain, lags = lag)) |>
    ggplot(aes(lag, acor)) +
        geom_hline(yintercept = 0, colour = "gray40", linetype = 2) +
        geom_point(size = 2, colour = "#7b7bb4") +
        geom_line(linewidth = 0.8, colour = "#7b7bb4") +
        labs(x = "Lag", y = "ACF")

Let’s experiment with changing the burn-in cycle’s size, to see if there’s perhaps a better value than the one we have chosen initially, 1000. To analyse the performance, we’ll monitor both the acceptance rate and the effective size returned by CODA.

purrr::map(
    c(500, 750, 1000, 1500, 2000, 4000, 8000),
    function(b) {
        res <- metropolis(posterior, std, niter, b, thinning)
        tibble(
            burn_in = b,
            acc_rate = res$acc_rate,
            eff_size = effectiveSize(res$chain)
        )
    }
) |> list_rbind() |> kbl() |> kable_styling(full_width = FALSE)
burn_in acc_rate eff_size
500 0.71166 352.7936
750 0.71200 330.7310
1000 0.71296 348.5090
1500 0.71053 275.7466
2000 0.70968 319.9463
4000 0.71041 267.4020
8000 0.70953 283.5120

The effective size is quite low with every choice of burn_in, and doesn’t change much. We’ll stick with the initial choice of 1000 for simplicity.

We can also try different values for the proposal distribution’s standard deviation and the thinning interval. The acceptance rate will only be affected by the standard deviation value, so we can check it separately, keeping the previous thinning value of 10.

purrr::map(
    c(0.5, 0.75, 1, 1.25, 1.5),
    function(s) {
        res <- metropolis(posterior, s, niter, burn_in, thinning)
        tibble(
            std = s,
            acc_rate = res$acc_rate,
            eff_size = effectiveSize(res$chain)
        )
    }
) |> list_rbind() |> kbl() |> kable_styling(full_width = FALSE)
std acc_rate eff_size
0.50 0.84491 75.42288
0.75 0.77388 161.38300
1.00 0.71308 289.66774
1.25 0.65467 581.91431
1.50 0.60676 971.87233

There’s a clear trade-off between the effective size and the acceptance rate: a wider proposal distribution leads to more uncorrelated samples, but it’s also “riskier”, lowering the acceptance rate. Let’s analyse the effective size to choose the best parameters.

get_eff_size <- function(s, t) {
    effectiveSize(metropolis(posterior, s, niter, burn_in, t)$chain)
}

expand_grid(
    std = c(0.5, 0.75, 1, 1.25, 1.5),
    thinning = c(5, 10, 20, 40, 80)
) |>
    mutate(eff_size = purrr::map2_dbl(std, thinning, get_eff_size)) |>
    ggplot(aes(factor(std), factor(thinning))) +
        geom_tile(aes(fill = eff_size)) +
        scale_fill_viridis_c(name = "Effective size") +
        labs(x = "Proposal standard deviation", y = "Thinning factor")

We can see that the thinning interval isn’t affecting the effective size as much as the standard deviation of the proposal distribution. Probably the most sensible thing to do, given these results, is to raise the standard deviation a bit, to 1.25 or 1.5. The other parameters seem good enough with the chosen values.

COVID-19 vaccines efficacy

We’ll analyse the initial test data reported on the European Medicines Agency website for the following early vaccines:

We’ll do a bayesian analysis to determine the vaccine efficacy, defined as

\[ E = \frac{R_\mathrm{p} - R_\mathrm{v}}{R_\mathrm{p}}, \]

where \(R_\mathrm{p}\) is the rate of infection in a control (placebo) group and \(R_\mathrm{v}\) the rate of infection in a vaccinated group.

Let’s define a general function, and then proceed with the analysis of each vaccine. The process is binomial, and as a prior we can choose a Beta(3, 100), with an expected value for the infection frequency in line with what was found during the COVID pandemic.

vaxtest <- function(tot_placebo, tot_vaccine, pos_placebo, pos_vaccine) {
    # total number of positives and negatives
    pos <- pos_vaccine + pos_placebo
    neg <- tot_vaccine + tot_placebo - pos_vaccine - pos_placebo

    # construct data list with test results and associated groups
    data <- list(
        test = c(rep(0, neg), rep(1, pos)),
        group = c(
            # negatives
            rep(1, tot_placebo - pos_placebo),
            rep(2, tot_vaccine - pos_vaccine),
            # positives
            rep(1, pos_placebo),
            rep(2, pos_vaccine)
        )
    )

    # jags model string
    model <- "
        model {
            for (i in 1:length(test)) {
                test[i] ~ dbern(theta[group[i]])
            }

            theta[1] ~ dbeta(3, 100) # placebo prior
            theta[2] ~ dbeta(3, 100) # vaccine prior
        }
    "

    # build and run the model with jags (4 parallel chains)
    runjags::run.jags(
        model = model,
        data = data,
        sample = 15000,
        burnin = 2000,
        n.chains = 4,
        method = "parallel",
        monitor = "theta"
    ) |>
        # tidy up the result in a tibble
        tidybayes::tidy_draws() |>
        dplyr::transmute(
            placebo = 100 * `theta[1]`,
            vaccine = 100 * `theta[2]`,
            efficacy = 100 * (1 - `theta[2]` / `theta[1]`)
        ) |>
        coda::as.mcmc()
}

Let’s also define a plotting function, using the package bayesplot.

vaxplot <- function(chain, pars) {
    bayesplot::mcmc_areas(
        chain, pars = pars, prob = 0.95, point_est = "mean"
    ) +
        ggplot2::labs(
            title = "Posterior distribution",
            subtitle = "with mean and 95% credibility interval",
            x = "Percentage"
        ) +
        bayesplot::theme_default(base_size = 14, base_family = "Open Sans")
}

Janssen’s vaccine

Let’s start with Janssen’s vaccine, “Jcovden”. According to the EMA,

The trial found a 67% reduction in the number of symptomatic COVID-19 cases after 2 weeks in people who received Jcovden (116 cases out of 19,630 people) compared with people given placebo (348 of 19,691 people). This means that the vaccine had a 67% efficacy.

janssen <- vaxtest(19691, 19630, 348, 116)
## Calling 4 simulations using the parallel method...
## Following the progress of chain 1 (the program will wait for all chains
## to finish before continuing):
## Welcome to JAGS 4.3.2 on Sun Jun  4 17:54:06 2023
## JAGS is free software and comes with ABSOLUTELY NO WARRANTY
## Loading module: basemod: ok
## Loading module: bugs: ok
## . . Reading data file data.txt
## . Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 39321
##    Unobserved stochastic nodes: 2
##    Total graph size: 78646
## . Reading parameter file inits1.txt
## . Initializing model
## . Adaptation skipped: model is not in adaptive mode.
## . Updating 2000
## -------------------------------------------------| 2000
## ************************************************** 100%
## . . Updating 15000
## -------------------------------------------------| 15000
## ************************************************** 100%
## . . . . Updating 0
## . Deleting model
## All chains have finished
## Note: the model did not require adaptation
## Simulation complete.  Reading coda files...
## Coda files loaded successfully
## Calculating summary statistics...
## Calculating the Gelman-Rubin statistic for 2 variables....
## Finished running the simulation
summary(janssen)
## 
## Iterations = 1:60000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 60000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##             Mean      SD  Naive SE Time-series SE
## placebo   1.7737 0.09390 0.0003833      0.0003833
## vaccine   0.6031 0.05518 0.0002253      0.0002253
## efficacy 65.9001 3.60967 0.0147364      0.0147364
## 
## 2. Quantiles for each variable:
## 
##             2.5%     25%     50%     75%   97.5%
## placebo   1.5951  1.7097  1.7717  1.8357  1.9633
## vaccine   0.4991  0.5652  0.6018  0.6392  0.7161
## efficacy 58.4104 63.5674 66.0575 68.4099 72.5316

Now, let’s plot the infection rate posteriors for both the control group and the vaccinated group.

vaxplot(janssen, pars = c("placebo", "vaccine"))

And finally, the efficacy.

vaxplot(janssen, pars = "efficacy")

Moderna’s vaccine

Now we move to Moderna’s vaccine, called “Pikevax”. The EMA states

The trial showed a 94.1% reduction in the number of symptomatic COVID-19 cases in the people who received the vaccine (11 out of 14,134 vaccinated people got COVID-19 with symptoms) compared with people who received dummy injections (185 out of 14,073 people who received dummy injections got COVID-19 with symptoms). This means that the vaccine demonstrated a 94.1% efficacy in the trial.

moderna <- vaxtest(14073, 14134, 185, 11)
## Calling 4 simulations using the parallel method...
## Following the progress of chain 1 (the program will wait for all chains
## to finish before continuing):
## Welcome to JAGS 4.3.2 on Sun Jun  4 17:56:12 2023
## JAGS is free software and comes with ABSOLUTELY NO WARRANTY
## Loading module: basemod: ok
## Loading module: bugs: ok
## . . Reading data file data.txt
## . Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 28207
##    Unobserved stochastic nodes: 2
##    Total graph size: 56418
## . Reading parameter file inits1.txt
## . Initializing model
## . Adaptation skipped: model is not in adaptive mode.
## . Updating 2000
## -------------------------------------------------| 2000
## ************************************************** 100%
## . . Updating 15000
## -------------------------------------------------| 15000
## ************************************************** 100%
## . . . . Updating 0
## . Deleting model
## . 
## All chains have finished
## Note: the model did not require adaptation
## Simulation complete.  Reading coda files...
## Coda files loaded successfully
## Calculating summary statistics...
## Calculating the Gelman-Rubin statistic for 2 variables....
## Finished running the simulation
summary(moderna)
## 
## Iterations = 1:60000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 60000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##             Mean      SD  Naive SE Time-series SE
## placebo   1.3254 0.09625 0.0003929      0.0003952
## vaccine   0.0985 0.02635 0.0001076      0.0001076
## efficacy 92.5287 2.07946 0.0084894      0.0084894
## 
## 2. Quantiles for each variable:
## 
##              2.5%      25%      50%     75%   97.5%
## placebo   1.14438  1.25894  1.32282  1.3888  1.5213
## vaccine   0.05383  0.07978  0.09603  0.1146  0.1562
## efficacy 87.92363 91.26507 92.74667 94.0134 95.9815

And the plots like before.

vaxplot(moderna, pars = c("placebo", "vaccine"))

vaxplot(moderna, pars = "efficacy")

AstraZeneca’s vaccine

Finally, the “Vaxzeviria” from AstraZeneca. The EMA says

These showed a 59.5% reduction in the number of symptomatic COVID-19 cases in people given the vaccine (64 of 5,258 got COVID-19 with symptoms) compared with people given control injections (154 of 5,210 got COVID-19 with symptoms). This means that the vaccine demonstrated around a 60% efficacy in these clinical trials.

astrazeneca <- vaxtest(5210, 5258, 154, 64)
## Calling 4 simulations using the parallel method...
## Following the progress of chain 1 (the program will wait for all chains
## to finish before continuing):
## Welcome to JAGS 4.3.2 on Sun Jun  4 17:57:44 2023
## JAGS is free software and comes with ABSOLUTELY NO WARRANTY
## Loading module: basemod: ok
## Loading module: bugs: ok
## . . Reading data file data.txt
## . Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 10468
##    Unobserved stochastic nodes: 2
##    Total graph size: 20940
## . Reading parameter file inits1.txt
## . Initializing model
## . Adaptation skipped: model is not in adaptive mode.
## . Updating 2000
## -------------------------------------------------| 2000
## ************************************************** 100%
## . . Updating 15000
## -------------------------------------------------| 15000
## ************************************************** 100%
## . . . . Updating 0
## . Deleting model
## All chains have finished
## Note: the model did not require adaptation
## Simulation complete.  Reading coda files...
## Coda files loaded successfully
## Calculating summary statistics...
## Calculating the Gelman-Rubin statistic for 2 variables....
## Finished running the simulation
summary(astrazeneca)
## 
## Iterations = 1:60000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 60000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##            Mean     SD Naive SE Time-series SE
## placebo   2.955 0.2320 0.000947      0.0009470
## vaccine   1.249 0.1514 0.000618      0.0006136
## efficacy 57.458 6.1450 0.025087      0.0255625
## 
## 2. Quantiles for each variable:
## 
##            2.5%    25%    50%    75%  97.5%
## placebo   2.516  2.795  2.949  3.108  3.427
## vaccine   0.970  1.145  1.243  1.348  1.563
## efficacy 44.366 53.593 57.834 61.784 68.358
vaxplot(astrazeneca, pars = c("placebo", "vaccine"))

vaxplot(astrazeneca, pars = "efficacy")

COVID-19 charts

We’ll analyse the global vaccination dataset provided by “Our World in Data”, available at https://ourworldindata.org/covid-vaccinations.

data <- read_csv("owid-covid-data.csv")

The CSV file follows a format of one row per location and date, keeping track of many variables:

spec(data)
## cols(
##   iso_code = col_character(),
##   continent = col_character(),
##   location = col_character(),
##   date = col_date(format = ""),
##   total_cases = col_double(),
##   new_cases = col_double(),
##   new_cases_smoothed = col_double(),
##   total_deaths = col_double(),
##   new_deaths = col_double(),
##   new_deaths_smoothed = col_double(),
##   total_cases_per_million = col_double(),
##   new_cases_per_million = col_double(),
##   new_cases_smoothed_per_million = col_double(),
##   total_deaths_per_million = col_double(),
##   new_deaths_per_million = col_double(),
##   new_deaths_smoothed_per_million = col_double(),
##   reproduction_rate = col_double(),
##   icu_patients = col_double(),
##   icu_patients_per_million = col_double(),
##   hosp_patients = col_double(),
##   hosp_patients_per_million = col_double(),
##   weekly_icu_admissions = col_double(),
##   weekly_icu_admissions_per_million = col_double(),
##   weekly_hosp_admissions = col_double(),
##   weekly_hosp_admissions_per_million = col_double(),
##   total_tests = col_double(),
##   new_tests = col_double(),
##   total_tests_per_thousand = col_double(),
##   new_tests_per_thousand = col_double(),
##   new_tests_smoothed = col_double(),
##   new_tests_smoothed_per_thousand = col_double(),
##   positive_rate = col_double(),
##   tests_per_case = col_double(),
##   tests_units = col_character(),
##   total_vaccinations = col_double(),
##   people_vaccinated = col_double(),
##   people_fully_vaccinated = col_double(),
##   total_boosters = col_double(),
##   new_vaccinations = col_double(),
##   new_vaccinations_smoothed = col_double(),
##   total_vaccinations_per_hundred = col_double(),
##   people_vaccinated_per_hundred = col_double(),
##   people_fully_vaccinated_per_hundred = col_double(),
##   total_boosters_per_hundred = col_double(),
##   new_vaccinations_smoothed_per_million = col_double(),
##   new_people_vaccinated_smoothed = col_double(),
##   new_people_vaccinated_smoothed_per_hundred = col_double(),
##   stringency_index = col_double(),
##   population_density = col_double(),
##   median_age = col_double(),
##   aged_65_older = col_double(),
##   aged_70_older = col_double(),
##   gdp_per_capita = col_double(),
##   extreme_poverty = col_double(),
##   cardiovasc_death_rate = col_double(),
##   diabetes_prevalence = col_double(),
##   female_smokers = col_double(),
##   male_smokers = col_double(),
##   handwashing_facilities = col_double(),
##   hospital_beds_per_thousand = col_double(),
##   life_expectancy = col_double(),
##   human_development_index = col_double(),
##   population = col_double(),
##   excess_mortality_cumulative_absolute = col_double(),
##   excess_mortality_cumulative = col_double(),
##   excess_mortality = col_double(),
##   excess_mortality_cumulative_per_million = col_double()
## )

The first thing that jumps out from a quick analysis of the iso_code and location columns is that, among all the countries and territories, there are also some “grouping” rows, e.g. low income or high income countries, or continents.

data |>
    select(iso_code, location) |>
    filter(str_starts(iso_code, "OWID")) |>
    distinct()
## # A tibble: 18 × 2
##    iso_code location           
##    <chr>    <chr>              
##  1 OWID_AFR Africa             
##  2 OWID_ASI Asia               
##  3 OWID_ENG England            
##  4 OWID_EUR Europe             
##  5 OWID_EUN European Union     
##  6 OWID_HIC High income        
##  7 OWID_KOS Kosovo             
##  8 OWID_LIC Low income         
##  9 OWID_LMC Lower middle income
## 10 OWID_NAM North America      
## 11 OWID_CYN Northern Cyprus    
## 12 OWID_NIR Northern Ireland   
## 13 OWID_OCE Oceania            
## 14 OWID_SCT Scotland           
## 15 OWID_SAM South America      
## 16 OWID_UMC Upper middle income
## 17 OWID_WLS Wales              
## 18 OWID_WRL World

So, we could directly work with the OWID_WRL rows, and get the whole-world data. We’ll later make sure that it checks out if we take the longer route of putting together all the rows pertaining to real single locations.

We’ll start with the vaccination data: let’s make a quick plot of the cumulative number of people who received at least one vaccine dose, per hundred (we find this in the people_vaccinated_per_hundred column).

data |>
    filter(location == "World") |>
    select(date, people_vaccinated_per_hundred) |>
    drop_na() |>
    ggplot(aes(date, people_vaccinated_per_hundred)) +
        geom_area(
            linewidth = 0.8, alpha = 0.5,
            colour = my_pal[1], fill = my_pal[1]
        ) +
        xlab("Date") +
        ylab("Vaccinated population (%)")

Let’s see if we get the same plot by grouping the locations.

data_no_owid <- data |> filter(!str_starts(iso_code, "OWID"))

data_no_owid |>
    drop_na(people_vaccinated, population) |>
    group_by(date) |>
    summarize(pvac = 100 * sum(people_vaccinated) / sum(population)) |>
    ggplot(aes(date, pvac)) +
        geom_point(size = 0.8, alpha = 0.5) +
        geom_smooth(
            span = 0.25, linewidth = 0.8,
            colour = my_pal[2], fill = my_pal[2], alpha = 0.5
        ) +
        xlab("Date") +
        ylab("Vaccinated population (%)")

As we can see, the trend is indeed similar. We can’t say much about the discrepancies, not knowing how the OWID_WRL data is constructed.

Let’s examine now the distribution of new doses administered by day, smoothed over a seven-day period. We can find this data in the column new_vaccinations_smoothed.

data_no_owid |>
    drop_na(new_vaccinations_smoothed, population) |>
    group_by(date) |>
    summarize(
        nvac = 100 * sum(new_vaccinations_smoothed) / sum(population)
    ) |>
    ggplot(aes(date, nvac)) +
        geom_area(
            linewidth = 0.8, alpha = 0.5,
            fill = my_pal[1], colour = my_pal[1]
        ) +
        xlab("Date") +
        ylab("New vaccinations (% of total population)")

We can also try a more interesting analysis by looking at the “Low/Middle/High Income” rows in the dataset, to see how higher income countries got hold of the vaccines sooner than the lower income ones. To avoid the double peaks of the second doses, we’ll look at the variable new_people_vaccinated_smoothed, which filters out the administrations of second or successive doses.

data |>
    filter(str_ends(location, "income")) |>
    drop_na(new_people_vaccinated_smoothed) |>
    select(date, location, vac = new_people_vaccinated_smoothed) |>
    mutate(location = fct_relevel(
        location,
        "High income",
        "Upper middle income",
        "Lower middle income",
        "Low income"
    )) |>
    ggplot(aes(date, vac)) +
        geom_area(
            aes(date, total),
            data = function(x) {
                x |> group_by(date) |>
                     summarize(total = sum(vac))
            }, colour = "grey40", fill = "grey80", alpha = 0.5
        ) +
        geom_area(
            aes(colour = location, fill = location),
            alpha = 0.5, position = "identity"
        ) +
        scale_y_continuous(
            labels = scales::label_number(suffix = "M", scale = 1e-6)
        ) +
        scale_colour_viridis_d(option = "plasma", guide = "none") +
        scale_fill_viridis_d(option = "plasma") +
        labs(
            x = "Date",
            y = "First doses (7-day smoothed)",
            fill = "Country income"
        )

Let’s look at the death count now. First, the cumulative total.

data_no_owid |>
    drop_na(total_deaths) |>
    group_by(date) |>
    summarize(deaths = sum(total_deaths)) |>
    ggplot(aes(date, deaths)) +
        geom_area(
            linewidth = 0.8, alpha = 0.5,
            colour = my_pal[3], fill = my_pal[3]
        ) +
        scale_y_continuous(
            breaks = scales::pretty_breaks(),
            labels = scales::label_number(suffix = "M", scale = 1e-6),
            limits = c(0, 8e6)
        ) +
        labs(x = "Date", y = "Total confirmed deaths")

And then, the weekly average (from the column new_deaths_smoothed).

data_no_owid |>
    drop_na(new_deaths_smoothed) |>
    group_by(date) |>
    summarize(deaths = sum(new_deaths_smoothed)) |>
    ggplot(aes(date, deaths)) +
        geom_area(
            linewidth = 0.8, alpha = 0.5,
            colour = my_pal[3], fill = my_pal[3]
        ) +
        scale_y_continuous(
            breaks = scales::pretty_breaks(),
            labels = scales::label_number(suffix = "k", scale = 1e-3)
        ) +
        labs(x = "Date", y = "Daily confirmed deaths (7-day smoothed)")